
An ideal machine learning project follows a general sequence of analysis stages for building a predictive model. The steps followed to perform this data analysis are:
Question: can we build a model to predict whether a project will be successful, failed, or cancelled, based on the given dataset?
List of possible predicting factors:
# --- Notebook environment setup: clear state, then load core data/plotting
# --- libraries, NLP tooling, feature selection, models, and tuning utilities.
%reset -f
#Load Pre-requisits
import sys
import os
import math
import pickle
import matplotlib
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import gc
import warnings
warnings.filterwarnings('ignore')  # silence library warnings for cleaner notebook output
np.set_printoptions(threshold=sys.maxsize)  # print full numpy arrays, never truncated
## Visualization libraries
import plotly.tools as tls
import plotly.offline as py
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)  # enable inline plotly rendering in this notebook
import plotly.graph_objs as go
from collections import Counter
##Text Processing
import nltk
nltk.download('stopwords')  # corpora downloads are no-ops if already present
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textblob import TextBlob
import string
import re
stop_words = set(stopwords.words('english'))  # English stop words for text cleaning
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
#Feature Selection/Elimination
import statsmodels.formula.api as sm
from sklearn.feature_selection import RFE
from sklearn.linear_model import LassoCV
#Bagging and Boosting Algorithms, Evaluation Metric
!pip install imblearn
!pip install scipy
from imblearn.over_sampling import ADASYN
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
#Algos
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier ##SKLearn GBM - slower
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier
import lightgbm as lgb
from sklearn.ensemble import VotingClassifier
##DR Tools (dimensionality reduction)
from sklearn.decomposition import PCA, TruncatedSVD, KernelPCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
##Hyper-parameter search utilities
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from pprint import pprint
[nltk_data] Downloading package stopwords to [nltk_data] C:\Users\hmnsh\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date! [nltk_data] Downloading package punkt to [nltk_data] C:\Users\hmnsh\AppData\Roaming\nltk_data... [nltk_data] Package punkt is already up-to-date!
Requirement already satisfied: imblearn in c:\programdata\anaconda3\lib\site-packages (0.0) Requirement already satisfied: imbalanced-learn in c:\programdata\anaconda3\lib\site-packages (from imblearn) (0.4.3) Requirement already satisfied: scipy>=0.13.3 in c:\programdata\anaconda3\lib\site-packages (from imbalanced-learn->imblearn) (1.1.0) Requirement already satisfied: scikit-learn>=0.20 in c:\programdata\anaconda3\lib\site-packages (from imbalanced-learn->imblearn) (0.20.3) Requirement already satisfied: numpy>=1.8.2 in c:\programdata\anaconda3\lib\site-packages (from imbalanced-learn->imblearn) (1.16.2) Requirement already satisfied: scipy in c:\programdata\anaconda3\lib\site-packages (1.1.0)
- failed 52.22
- successful 35.38
- canceled 10.24
- undefined 0.94
- live 0.74
- suspended 0.49
Cancelled state: about 10% of the projects in this dataset are in the cancelled state. Since the dataset gives no clear reason for a project being cancelled, nor the date on which it was cancelled, the cancelled state should be treated as a separate state and not as failed.
For example, the project owner could have obtained funding from somewhere else, or the project requirements may have changed, leading the owner to recreate the online crowdfunding campaign.
# Report the raw dataset dimensions and preview the first rows.
n_rows, n_cols = df_ks.shape
print ("Total Projects: ", n_rows, "\nTotal Features: ", n_cols)
df_ks.head()
Total Projects: 378661 Total Features: 15
| ID | name | category | main_category | currency | deadline | goal | launched | pledged | state | backers | country | usd pledged | usd_pledged_real | usd_goal_real | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1000002330 | The Songs of Adelaide & Abullah | Poetry | Publishing | GBP | 2015-10-09 | 1000.0 | 2015-08-11 12:12:28 | 0.0 | failed | 0 | GB | 0.0 | 0.0 | 1533.95 |
| 1 | 1000003930 | Greeting From Earth: ZGAC Arts Capsule For ET | Narrative Film | Film & Video | USD | 2017-11-01 | 30000.0 | 2017-09-02 04:43:57 | 2421.0 | failed | 15 | US | 100.0 | 2421.0 | 30000.00 |
| 2 | 1000004038 | Where is Hank? | Narrative Film | Film & Video | USD | 2013-02-26 | 45000.0 | 2013-01-12 00:20:50 | 220.0 | failed | 3 | US | 220.0 | 220.0 | 45000.00 |
| 3 | 1000007540 | ToshiCapital Rekordz Needs Help to Complete Album | Music | Music | USD | 2012-04-16 | 5000.0 | 2012-03-17 03:24:11 | 1.0 | failed | 1 | US | 1.0 | 1.0 | 5000.00 |
| 4 | 1000011046 | Community Film Project: The Art of Neighborhoo... | Film & Video | Film & Video | USD | 2015-08-29 | 19500.0 | 2015-07-04 08:35:03 | 1283.0 | canceled | 14 | US | 1283.0 | 1283.0 | 19500.00 |
Note: the Name column has 4 NaN values, whereas the usd pledged column has 3797 NaN values. These rows can be removed directly, as the dataset is big enough to perform the analysis without them.
def data_clean(df_ks):
    """Return a cleaned copy of the Kickstarter frame.

    Drops incomplete rows, keeps only projects whose outcome is decided,
    removes columns unused downstream, and filters out extreme-goal noise.
    """
    # Discard any row with a missing value; the dataset is large enough to afford this.
    cleaned = df_ks.dropna()
    # Keep only terminal outcomes; live/suspended/undefined projects are excluded.
    terminal_states = ["failed", "successful", 'canceled']
    cleaned = cleaned[cleaned["state"].isin(terminal_states)]
    # These columns duplicate the usd_*_real columns or are not useful as features.
    cleaned = cleaned.drop(["ID", "currency", "pledged", "usd pledged", "goal"], axis=1)
    # Goals above ~2.2M USD are treated as noise in this analysis.
    cleaned = cleaned[cleaned['usd_goal_real'] < 2200000]
    return cleaned
# Apply the cleaning step and report the change in row/column counts.
print("Before Cleaning:", df_ks.shape)
df_clean = data_clean(df_ks)
print("After Cleaning:", df_clean.shape)
del df_ks ## releasing system memory - the raw frame is no longer needed
gc.collect()
Before Cleaning: (378661, 15) After Cleaning: (369678, 10)
542
# Express goal/pledged amounts in millions of USD purely for plot readability.
# (Division of a float column already yields float; no second cast is needed.)
df_clean['Goal(USD Millions)'] = df_clean['usd_goal_real'].astype(float)/1000000
df_clean['Pledged(USD Millions)'] = df_clean['usd_pledged_real'].astype(float)/1000000
plt.figure(figsize=(12,6))
plt.suptitle('(Exploration) Goal vs Pledged Amount', fontsize=24)
sns.set_style('whitegrid')
sns.set(font_scale=1.4)
# Color each project by its final state to expose the goal/pledged relationship.
ax = sns.scatterplot(x="Goal(USD Millions)", y="Pledged(USD Millions)", s=130, hue='state' , data=df_clean)
plt.show()
# Drop the helper columns so the working frame keeps its original schema.
df_clean = df_clean.drop(["Goal(USD Millions)", "Pledged(USD Millions)"], axis = 1)
Numeric variables such as backers, usd_pledged_real, and usd_goal_real are highly right-skewed, because many failed projects have no backers and no pledged amount at all. This will be addressed through data normalization while developing a model.
To explore this data, it needs to be transformed; histograms of the transformed values can then be used to visualize the distributions.
| skew | goal_real - 12.765938 | Pledged_real - 82.063085 | backers - 86.294188 |
| Column | usd_goal_real_log | usd_pledged_real_log |
|---|---|---|
| count | 369678.000000 | 369678.000000 |
| mean | 8.632460 | 5.775453 |
| std | 1.671539 | 3.309677 |
| min | 0.009950 | 0.000000 |
| 25% | 7.601402 | 3.526361 |
| 50% | 8.612685 | 6.456770 |
| 75% | 9.662097 | 8.314587 |
| max | 14.591996 | 16.828050 |
Minimum goal amount is as small as 0.01
# Re-import the offline plotly helpers (harmless if the setup cell already ran).
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
#General Stats
# Log-transform the heavily right-skewed monetary columns (+1 guards log(0)).
df_clean["usd_goal_real_log"] = np.log(df_clean.usd_goal_real+1)
df_clean["usd_pledged_real_log"] = np.log(df_clean.usd_pledged_real+1)
#df_clean["backers_log"] = np.log(df_clean.backers+1)
# drop by Name
# df1 existed only for the (now commented-out) describe() of the log columns.
df1 = df_clean.drop(['usd_goal_real', 'usd_pledged_real', 'backers'], axis=1)
#print (df1.describe())
del df1
# Remove the temporary log columns so df_clean keeps its original schema.
df_clean.drop(['usd_goal_real_log', 'usd_pledged_real_log'], axis=1, inplace = True)
gc.collect()
#print("Minimum goal amount is as small as 0.01")
#configure_plotly_browser_state()
# Per-state subsets consumed by the distribution plots in the next cell.
df_cancel = df_clean[df_clean["state"] == "canceled"]
df_failed = df_clean[df_clean["state"] == "failed"]
df_sucess = df_clean[df_clean["state"] == "successful"]
#First plot: overall log-goal distribution (probability-normalized histogram)
trace0 = go.Histogram(
    x= np.log(df_clean.usd_goal_real+1),
    histnorm='probability', showlegend=False,
    xbins=dict(
        start=-5.0,
        end=19.0,
        size=1),
    autobiny=True)
#Second plot: overall log-pledged distribution
trace1 = go.Histogram(
    x = np.log(df_clean.usd_pledged_real+1),
    histnorm='probability', showlegend=False,
    xbins=dict(
        start=-1.0,
        end=17.0,
        size=1))
# Add histogram data: log-goal values split by final project state
x1 = np.log(df_failed['usd_goal_real']+1)
x2 = np.log(df_sucess["usd_goal_real"]+1)
x3 = np.log(df_cancel["usd_goal_real"]+1)
trace3 = go.Histogram(
    x=x1,
    opacity=0.60, nbinsx=30, name='Goals Failed', histnorm='probability'
)
trace4 = go.Histogram(
    x=x2,
    opacity=0.60, nbinsx=30, name='Goals Sucessful', histnorm='probability'
)
trace5 = go.Histogram(
    x=x3,
    opacity=0.60, nbinsx=30, name='Goals Cancelled', histnorm='probability'
)
data = [trace0, trace1, trace3, trace4, trace5]
layout = go.Layout(barmode='overlay')
#Creating the grid: one full-width cell on top, two cells below
# NOTE(review): tls.make_subplots is deprecated in newer plotly releases in
# favor of plotly.subplots.make_subplots - confirm against the pinned version.
fig = tls.make_subplots(rows=2, cols=2, specs=[ [{'colspan': 2}, None], [{}, {}]],
                        subplot_titles=('Failed, Cancelled and Sucessful Projects',
                                        'Goal','Pledged'))
#setting the figs: by-state overlay on top, overall goal/pledged below
fig.append_trace(trace0, 2, 1)
fig.append_trace(trace1, 2, 2)
fig.append_trace(trace3, 1, 1)
fig.append_trace(trace4, 1, 1)
fig.append_trace(trace5, 1, 1)
fig['layout'].update(title="(Data Exploration) Log Transformed Distribuitions",
                     height=500, width=900, barmode='overlay')
iplot(fig)
This is the format of your plot grid: [ (1,1) x1,y1 - ] [ (2,1) x2,y2 ] [ (2,2) x3,y3 ]